library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tree)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(class)
library (glmnet)
## Loading required package: Matrix
## Loaded glmnet 3.0-2
library(corrplot)
## corrplot 0.84 loaded
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.tree tree
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 3.0.1 ✔ purrr 0.3.4
## ✔ tidyr 1.0.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(cluster)
wine=read.csv("wine_white.csv")
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
head(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
pairs(quality~.,data=wine, main="simple scatterplot matrix")
corr=cor(wine)
corrplot(corr, method="circle")
wine$quality =as.factor(wine$quality)
wine %>%
mutate(quality = as.factor(quality)) %>%
select(-c(residual.sugar, free.sulfur.dioxide, total.sulfur.dioxide, chlorides)) %>%
ggpairs(aes(color = quality, alpha = 0.4),
columns = 1:7,
lower = list(continuous = "points"),
upper = list(continuous = "blank"),
axisLabels = "none", switch = "both")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridis)
## Loading required package: viridisLite
wine %>%
plot_ly(x=~alcohol,y=~volatile.acidity,z= ~sulphates, color=~quality, hoverinfo = 'text', colors = viridis(3),
text = ~paste('Quality:', quality,
'<br>Alcohol:', alcohol,
'<br>Volatile Acidity:', volatile.acidity,
'<br>sulphates:', sulphates)) %>%
add_markers(opacity = 0.8) %>%
layout(title = "3D Wine Quality",
annotations=list(yref='paper',xref="paper",y=1.05,x=1.1, text="quality",showarrow=F), scene = list(xaxis = list(title = 'Alcohol'),
yaxis = list(title = 'Volatile Acidity'),
zaxis = list(title = 'sulphates')))
#Distribution of wine quality ratings
ggplot(wine, aes(x = quality)) +
geom_bar(stat = "count",position = "dodge") + ggtitle("Distribution of Wine Quality Ratings") + theme_classic()
data = read.csv("wine_white.csv")
head(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
summary(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
df = data.frame(scale(data[1:11])) #scale dataset
df = na.omit(df)
head(df)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 0.1720794 -0.08176155 0.21325843 2.8210611 -0.03535139
## 2 -0.6574340 0.21587359 0.04799622 -0.9446688 0.14773200
## 3 1.4756004 0.01745016 0.54378284 0.1002720 0.19350284
## 4 0.4090832 -0.47860841 -0.11726599 0.4157258 0.55966962
## 5 0.4090832 -0.47860841 -0.11726599 0.4157258 0.55966962
## 6 1.4756004 0.01745016 0.54378284 0.1002720 0.19350284
## free.sulfur.dioxide total.sulfur.dioxide density pH
## 1 0.5698734 0.7444890 2.331273996 -1.24679399
## 2 -1.2528907 -0.1496693 -0.009153237 0.73995309
## 3 -0.3121093 -0.9732363 0.358628185 0.47505348
## 4 0.6874711 1.1209768 0.525801559 0.01147916
## 5 0.6874711 1.1209768 0.525801559 0.01147916
## 6 -0.3121093 -0.9732363 0.358628185 0.47505348
## sulphates alcohol
## 1 -0.34914861 -1.3930102
## 2 0.00134171 -0.8241915
## 3 -0.43677119 -0.3366326
## 4 -0.78726151 -0.4991523
## 5 -0.78726151 -0.4991523
## 6 -0.43677119 -0.3366326
summary(df)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. :-3.61998 Min. :-1.9668 Min. :-2.7615 Min. :-1.1418
## 1st Qu.:-0.65743 1st Qu.:-0.6770 1st Qu.:-0.5304 1st Qu.:-0.9250
## Median :-0.06492 Median :-0.1810 Median :-0.1173 Median :-0.2349
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.52758 3rd Qu.: 0.4143 3rd Qu.: 0.4612 3rd Qu.: 0.6917
## Max. : 8.70422 Max. : 8.1528 Max. :10.9553 Max. :11.7129
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :-1.6831 Min. :-1.95848 Min. :-3.0439
## 1st Qu.:-0.4473 1st Qu.:-0.72370 1st Qu.:-0.7144
## Median :-0.1269 Median :-0.07691 Median :-0.1026
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.1935 3rd Qu.: 0.62867 3rd Qu.: 0.6739
## Max. :13.7417 Max. :14.91679 Max. : 7.0977
## density pH sulphates
## Min. :-2.31280 Min. :-3.10109 Min. :-2.3645
## 1st Qu.:-0.77063 1st Qu.:-0.65077 1st Qu.:-0.6996
## Median :-0.09608 Median :-0.05475 Median :-0.1739
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.69298 3rd Qu.: 0.60750 3rd Qu.: 0.5271
## Max. :15.02976 Max. : 4.18365 Max. : 5.1711
## alcohol
## Min. :-2.04309
## 1st Qu.:-0.82419
## Median :-0.09285
## Mean : 0.00000
## 3rd Qu.: 0.71974
## Max. : 2.99502
wss <- (nrow(df)-1)*sum(apply(df,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
## Warning: did not converge in 10 iterations
plot(1:15, wss, type="b", xlab="Number of Clusters",
ylab="Within-cluster sum of squares")
silhouette_score <- function(k){
km <- kmeans(df, centers = k, nstart=20, iter.max=50)
ss <- silhouette(km$cluster, dist(df))
mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)
fviz_nbclust(df, kmeans, method='silhouette')
# compute gap statistic
set.seed(123)
gap_stat <- clusGap(df, FUN = kmeans, nstart = 25,
K.max = 10, B = 50, iter.max=50)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 244900)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 244900)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 244900)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25, iter.max = 50)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 2
## logW E.logW gap SE.sim
## [1,] 8.598970 9.886755 1.287785 0.002156290
## [2,] 8.475417 9.803630 1.328213 0.002382922
## [3,] 8.429626 9.753553 1.323927 0.002191498
## [4,] 8.399495 9.717269 1.317774 0.002405187
## [5,] 8.364682 9.695848 1.331166 0.002272476
## [6,] 8.338565 9.676000 1.337435 0.002228208
## [7,] 8.314816 9.658963 1.344147 0.002219907
## [8,] 8.295053 9.642652 1.347599 0.002360697
## [9,] 8.274388 9.629123 1.354735 0.002294227
## [10,] 8.260260 9.616375 1.356115 0.002315594
fviz_gap_stat(gap_stat)
set.seed(2020)
km.out1=kmeans(df,2,nstart=20)
km.out1$tot.withinss
## [1] 42539.96
fviz_cluster(km.out1, data = df)
## ramdomly select 5 rows from each quality degree
d3 = data[data$quality == 3, ]
d3 = d3[sample(nrow(d3), 5), ]
d3
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3266 4.2 0.215 0.23 5.1 0.041
## 1932 7.1 0.490 0.22 2.0 0.047
## 3410 6.2 0.230 0.35 0.7 0.051
## 1485 7.5 0.320 0.24 4.6 0.053
## 3088 6.1 0.200 0.34 9.5 0.041
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 3266 64.0 157.0 0.99688 3.42 0.44
## 1932 146.5 307.5 0.99240 3.24 0.37
## 3410 24.0 111.0 0.99160 3.37 0.43
## 1485 8.0 134.0 0.99580 3.14 0.50
## 3088 38.0 201.0 0.99500 3.14 0.44
## alcohol quality
## 3266 8.0 3
## 1932 11.0 3
## 3410 11.0 3
## 1485 9.1 3
## 3088 10.1 3
d4 = data[data$quality == 4, ]
d4 = d4[sample(nrow(d4), 5), ]
d4
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1703 6.5 0.27 0.19 4.2 0.046
## 2533 6.7 0.54 0.27 7.1 0.049
## 1060 5.3 0.30 0.20 1.1 0.077
## 907 8.3 0.27 0.45 1.3 0.048
## 3219 6.5 0.29 0.25 2.5 0.142
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 1703 6 114 0.99550 3.25 0.35
## 2533 8 178 0.99502 3.16 0.38
## 1060 48 166 0.99440 3.30 0.54
## 907 8 72 0.99440 3.08 0.61
## 3219 8 111 0.99270 3.00 0.44
## alcohol quality
## 1703 8.6 4
## 2533 9.4 4
## 1060 8.7 4
## 907 10.3 4
## 3219 9.9 4
d5 = data[data$quality == 5, ]
d5 = d5[sample(nrow(d5), 5), ]
d5
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 2594 6.3 0.21 0.29 11.7 0.048
## 430 7.1 0.31 0.47 13.6 0.056
## 785 7.2 0.23 0.19 13.7 0.052
## 1145 7.2 0.25 0.32 1.5 0.047
## 2540 6.2 0.27 0.18 1.5 0.028
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 2594 49 147 0.99482 3.22 0.38
## 430 54 197 0.99780 3.10 0.49
## 785 47 197 0.99865 3.12 0.53
## 1145 27 132 0.99330 3.26 0.44
## 2540 20 111 0.99228 3.41 0.50
## alcohol quality
## 2594 10.8 5
## 430 9.3 5
## 785 9.0 5
## 1145 10.5 5
## 2540 10.0 5
d6 = data[data$quality == 6, ]
d6 = d6[sample(nrow(d6), 5), ]
d6
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3224 8.2 0.30 0.44 12.4 0.043
## 2218 8.1 0.25 0.38 3.8 0.051
## 3978 6.9 0.27 0.25 7.5 0.030
## 1483 6.7 0.16 0.49 2.4 0.046
## 1920 7.3 0.22 0.50 13.7 0.049
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 3224 52 154 0.99452 3.04 0.33
## 2218 18 129 0.99280 3.21 0.38
## 3978 18 117 0.99116 3.09 0.38
## 1483 57 187 0.99520 3.62 0.81
## 1920 56 189 0.99940 3.24 0.66
## alcohol quality
## 3224 12.0 6
## 2218 11.5 6
## 3978 13.0 6
## 1483 10.4 6
## 1920 9.0 6
d7 = data[data$quality == 7, ]
d7 = d7[sample(nrow(d7), 5), ]
d7
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 600 6.9 0.19 0.40 1.40 0.036
## 2556 6.3 0.13 0.42 1.10 0.043
## 3597 6.6 0.17 0.28 1.10 0.034
## 352 7.3 0.33 0.40 6.85 0.038
## 2176 7.4 0.19 0.30 12.80 0.053
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 600 14.0 55 0.99090 3.08 0.68
## 2556 63.0 146 0.99066 3.13 0.72
## 3597 55.0 108 0.98939 3.00 0.52
## 352 32.0 138 0.99200 3.03 0.30
## 2176 48.5 212 0.99860 3.14 0.49
## alcohol quality
## 600 11.5 7
## 2556 11.2 7
## 3597 11.9 7
## 352 11.9 7
## 2176 9.1 7
d8 = data[data$quality == 8, ]
d8 = d8[sample(nrow(d8), 5), ]
d8
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 160 5.2 0.44 0.04 1.40 0.036
## 3462 6.7 0.24 0.30 3.85 0.042
## 331 6.4 0.32 0.35 4.80 0.030
## 3071 6.8 0.28 0.43 7.60 0.030
## 1633 7.1 0.28 0.49 6.50 0.041
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 160 43 119 0.98940 3.36 0.33
## 3462 105 179 0.99189 3.04 0.59
## 331 34 101 0.99120 3.36 0.60
## 3071 30 110 0.99164 3.08 0.59
## 1633 28 111 0.99260 3.41 0.58
## alcohol quality
## 160 12.1 8
## 3462 11.3 8
## 331 12.5 8
## 3071 12.5 8
## 1633 12.2 8
d9 = data[data$quality == 9, ]
d9 = d9[sample(nrow(d9), 5), ]
d9
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 828 7.4 0.24 0.36 2.0 0.031
## 877 6.9 0.36 0.34 4.2 0.018
## 821 6.6 0.36 0.29 1.6 0.021
## 775 9.1 0.27 0.45 10.6 0.035
## 1606 7.1 0.26 0.49 2.2 0.032
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 828 27 139 0.99055 3.28 0.48
## 877 57 119 0.98980 3.28 0.36
## 821 24 85 0.98965 3.41 0.61
## 775 28 124 0.99700 3.20 0.46
## 1606 31 113 0.99030 3.37 0.42
## alcohol quality
## 828 12.5 9
## 877 12.7 9
## 821 12.4 9
## 775 10.4 9
## 1606 12.9 9
new_df = rbind(d3,d4,d5,d6,d7,d8,d9)
new_df = new_df[1:11]
new_df
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 3266 4.2 0.215 0.23 5.10 0.041
## 1932 7.1 0.490 0.22 2.00 0.047
## 3410 6.2 0.230 0.35 0.70 0.051
## 1485 7.5 0.320 0.24 4.60 0.053
## 3088 6.1 0.200 0.34 9.50 0.041
## 1703 6.5 0.270 0.19 4.20 0.046
## 2533 6.7 0.540 0.27 7.10 0.049
## 1060 5.3 0.300 0.20 1.10 0.077
## 907 8.3 0.270 0.45 1.30 0.048
## 3219 6.5 0.290 0.25 2.50 0.142
## 2594 6.3 0.210 0.29 11.70 0.048
## 430 7.1 0.310 0.47 13.60 0.056
## 785 7.2 0.230 0.19 13.70 0.052
## 1145 7.2 0.250 0.32 1.50 0.047
## 2540 6.2 0.270 0.18 1.50 0.028
## 3224 8.2 0.300 0.44 12.40 0.043
## 2218 8.1 0.250 0.38 3.80 0.051
## 3978 6.9 0.270 0.25 7.50 0.030
## 1483 6.7 0.160 0.49 2.40 0.046
## 1920 7.3 0.220 0.50 13.70 0.049
## 600 6.9 0.190 0.40 1.40 0.036
## 2556 6.3 0.130 0.42 1.10 0.043
## 3597 6.6 0.170 0.28 1.10 0.034
## 352 7.3 0.330 0.40 6.85 0.038
## 2176 7.4 0.190 0.30 12.80 0.053
## 160 5.2 0.440 0.04 1.40 0.036
## 3462 6.7 0.240 0.30 3.85 0.042
## 331 6.4 0.320 0.35 4.80 0.030
## 3071 6.8 0.280 0.43 7.60 0.030
## 1633 7.1 0.280 0.49 6.50 0.041
## 828 7.4 0.240 0.36 2.00 0.031
## 877 6.9 0.360 0.34 4.20 0.018
## 821 6.6 0.360 0.29 1.60 0.021
## 775 9.1 0.270 0.45 10.60 0.035
## 1606 7.1 0.260 0.49 2.20 0.032
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
## 3266 64.0 157.0 0.99688 3.42 0.44
## 1932 146.5 307.5 0.99240 3.24 0.37
## 3410 24.0 111.0 0.99160 3.37 0.43
## 1485 8.0 134.0 0.99580 3.14 0.50
## 3088 38.0 201.0 0.99500 3.14 0.44
## 1703 6.0 114.0 0.99550 3.25 0.35
## 2533 8.0 178.0 0.99502 3.16 0.38
## 1060 48.0 166.0 0.99440 3.30 0.54
## 907 8.0 72.0 0.99440 3.08 0.61
## 3219 8.0 111.0 0.99270 3.00 0.44
## 2594 49.0 147.0 0.99482 3.22 0.38
## 430 54.0 197.0 0.99780 3.10 0.49
## 785 47.0 197.0 0.99865 3.12 0.53
## 1145 27.0 132.0 0.99330 3.26 0.44
## 2540 20.0 111.0 0.99228 3.41 0.50
## 3224 52.0 154.0 0.99452 3.04 0.33
## 2218 18.0 129.0 0.99280 3.21 0.38
## 3978 18.0 117.0 0.99116 3.09 0.38
## 1483 57.0 187.0 0.99520 3.62 0.81
## 1920 56.0 189.0 0.99940 3.24 0.66
## 600 14.0 55.0 0.99090 3.08 0.68
## 2556 63.0 146.0 0.99066 3.13 0.72
## 3597 55.0 108.0 0.98939 3.00 0.52
## 352 32.0 138.0 0.99200 3.03 0.30
## 2176 48.5 212.0 0.99860 3.14 0.49
## 160 43.0 119.0 0.98940 3.36 0.33
## 3462 105.0 179.0 0.99189 3.04 0.59
## 331 34.0 101.0 0.99120 3.36 0.60
## 3071 30.0 110.0 0.99164 3.08 0.59
## 1633 28.0 111.0 0.99260 3.41 0.58
## 828 27.0 139.0 0.99055 3.28 0.48
## 877 57.0 119.0 0.98980 3.28 0.36
## 821 24.0 85.0 0.98965 3.41 0.61
## 775 28.0 124.0 0.99700 3.20 0.46
## 1606 31.0 113.0 0.99030 3.37 0.42
## alcohol
## 3266 8.0
## 1932 11.0
## 3410 11.0
## 1485 9.1
## 3088 10.1
## 1703 8.6
## 2533 9.4
## 1060 8.7
## 907 10.3
## 3219 9.9
## 2594 10.8
## 430 9.3
## 785 9.0
## 1145 10.5
## 2540 10.0
## 3224 12.0
## 2218 11.5
## 3978 13.0
## 1483 10.4
## 1920 9.0
## 600 11.5
## 2556 11.2
## 3597 11.9
## 352 11.9
## 2176 9.1
## 160 12.1
## 3462 11.3
## 331 12.5
## 3071 12.5
## 1633 12.2
## 828 12.5
## 877 12.7
## 821 12.4
## 775 10.4
## 1606 12.9
hc.complete=hclust(dist(new_df), method="complete")
plot(hc.complete,main="Complete Linkage", xlab="index of selected row", sub="", cex =.9)
hc.average=hclust(dist(new_df), method="average")
plot(hc.average , main="Average Linkage", xlab="index of selected row", sub="", cex =.9)
hc.single=hclust(dist(new_df), method="single")
plot(hc.single , main="Single Linkage", xlab="index of selected row", sub="", cex =.9)
data = read.csv("wine_white.csv")
head(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
#summary(data)
set.seed(210)
table(data$quality)
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
### we can see there the data is imbalanced.
### to deal with this data imbalance we will use oversampling
data <- data[sample(1:nrow(data),nrow(data),prob = ifelse(data$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(data$quality)
##
## 3 4 5 6 7 8 9
## 126 149 1399 2133 888 172 31
split into 3 clusters:
if quality < 5 -> low;
if 6 <= quality < 7 -> median;
if quality >= 7 -> high
library(dplyr)
library(caret)
library(tree)
data = data %>% mutate(quality_degree = case_when(quality >= 7 ~ 'High', quality >= 5 ~ 'Mid', TRUE ~ 'Low'))
data=data[, c(1:11, 13)]
data$quality_degree=as.factor(data$quality_degree)
summary(data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.200 Min. :0.080 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.210 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.260 Median :0.3100 Median : 5.000
## Mean : 6.867 Mean :0.277 Mean :0.3325 Mean : 6.282
## 3rd Qu.: 7.300 3rd Qu.:0.320 3rd Qu.:0.3800 3rd Qu.: 9.700
## Max. :14.200 Max. :1.100 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 10.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:107.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04519 Mean : 36.05 Mean :139.3
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:168.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9874 Min. :2.740 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.189 Mean :0.4919 Mean :10.54
## 3rd Qu.:0.9960 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.810 Max. :1.0800 Max. :14.20
## quality_degree
## High:1091
## Low : 275
## Mid :3532
##
##
##
# splid the data into training and testing
set.seed(13)
#dim(data)
training_index=sample(1:dim(data)[1], dim(data)[1]*0.8)
testing_index=-training_index
training_set=data[training_index, ]
testing_set=data[testing_index, ]
summary(training_set)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.200 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3100 Median : 4.900
## Mean : 6.871 Mean :0.2758 Mean :0.3316 Mean : 6.292
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3800 3rd Qu.: 9.800
## Max. :10.700 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.0 Min. : 10.0
## 1st Qu.:0.03600 1st Qu.: 23.0 1st Qu.:107.0
## Median :0.04300 Median : 34.0 Median :134.0
## Mean :0.04544 Mean : 36.1 Mean :139.4
## 3rd Qu.:0.05000 3rd Qu.: 46.0 3rd Qu.:168.0
## Max. :0.34600 Max. :289.0 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9874 Min. :2.740 Min. :0.250 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.080 1st Qu.:0.410 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.470 Median :10.40
## Mean :0.9940 Mean :3.189 Mean :0.492 Mean :10.52
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.550 3rd Qu.:11.40
## Max. :1.0390 Max. :3.810 Max. :1.080 Max. :14.20
## quality_degree
## High: 845
## Low : 224
## Mid :2849
##
##
##
set.seed(13)
#dim(data)
training_index=sample(1:dim(data)[1], dim(data)[1]*0.8)
testing_index=-training_index
training_set=data[training_index, ]
testing_set=data[testing_index, ]
summary(training_set)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.200 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3100 Median : 4.900
## Mean : 6.871 Mean :0.2758 Mean :0.3316 Mean : 6.292
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3800 3rd Qu.: 9.800
## Max. :10.700 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.0 Min. : 10.0
## 1st Qu.:0.03600 1st Qu.: 23.0 1st Qu.:107.0
## Median :0.04300 Median : 34.0 Median :134.0
## Mean :0.04544 Mean : 36.1 Mean :139.4
## 3rd Qu.:0.05000 3rd Qu.: 46.0 3rd Qu.:168.0
## Max. :0.34600 Max. :289.0 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9874 Min. :2.740 Min. :0.250 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.080 1st Qu.:0.410 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.470 Median :10.40
## Mean :0.9940 Mean :3.189 Mean :0.492 Mean :10.52
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.550 3rd Qu.:11.40
## Max. :1.0390 Max. :3.810 Max. :1.080 Max. :14.20
## quality_degree
## High: 845
## Low : 224
## Mid :2849
##
##
##
tree.wine = tree(quality_degree~., data=training_set)
summary(tree.wine)
##
## Classification tree:
## tree(formula = quality_degree ~ ., data = training_set)
## Variables actually used in tree construction:
## [1] "alcohol" "volatile.acidity" "free.sulfur.dioxide"
## [4] "residual.sugar" "total.sulfur.dioxide"
## Number of terminal nodes: 10
## Residual mean deviance: 1.153 = 4507 / 3908
## Misclassification error rate: 0.2422 = 949 / 3918
# the training error rate is 0.2422
# there are 10 terminal nodes
plot(tree.wine)
text(tree.wine, pretty=0)
set.seed(213)
pred = predict(tree.wine, testing_set, type = "class")
caret::confusionMatrix(pred, testing_set$quality_degree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low Mid
## High 44 0 30
## Low 1 10 6
## Mid 201 41 647
##
## Overall Statistics
##
## Accuracy : 0.7153
## 95% CI : (0.6859, 0.7434)
## No Information Rate : 0.6969
## P-Value [Acc > NIR] : 0.1115
##
## Kappa : 0.1817
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Low Class: Mid
## Sensitivity 0.17886 0.19608 0.9473
## Specificity 0.95913 0.99247 0.1852
## Pos Pred Value 0.59459 0.58824 0.7278
## Neg Pred Value 0.77704 0.95742 0.6044
## Prevalence 0.25102 0.05204 0.6969
## Detection Rate 0.04490 0.01020 0.6602
## Detection Prevalence 0.07551 0.01735 0.9071
## Balanced Accuracy 0.56899 0.59427 0.5662
# accuracy: 0.752
set.seed(11)
cross_validation_tree =cv.tree(tree.wine ,FUN=prune.misclass )
names(cross_validation_tree)
## [1] "size" "dev" "k" "method"
cross_validation_tree # dev corresponds to the cross-validation error rate in this instance
## $size
## [1] 10 8 7 5 1
##
## $dev
## [1] 986 986 980 987 1069
##
## $k
## [1] -Inf 0.00 6.00 7.50 24.75
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cross_validation_tree$size,cross_validation_tree$dev ,type="b")
# We now apply the prune.misclass() function in order to prune the tree to prune.obtain the five-node tree.
# print out the best tree node size
cross_validation_tree$size[which.min(cross_validation_tree$dev)]
## [1] 7
prune.wine =prune.misclass (tree.wine, best =cross_validation_tree$size[which.min(cross_validation_tree$dev)])
plot(prune.wine)
text(prune.wine,pretty =0)
purn.pred=predict(prune.wine, testing_set, type="class")
# Calculate the accuracy:
caret::confusionMatrix(purn.pred, testing_set$quality_degree)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low Mid
## High 44 0 30
## Low 1 8 4
## Mid 201 43 649
##
## Overall Statistics
##
## Accuracy : 0.7153
## 95% CI : (0.6859, 0.7434)
## No Information Rate : 0.6969
## P-Value [Acc > NIR] : 0.1115
##
## Kappa : 0.1755
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: High Class: Low Class: Mid
## Sensitivity 0.17886 0.156863 0.9502
## Specificity 0.95913 0.994618 0.1785
## Pos Pred Value 0.59459 0.615385 0.7268
## Neg Pred Value 0.77704 0.955533 0.6092
## Prevalence 0.25102 0.052041 0.6969
## Detection Rate 0.04490 0.008163 0.6622
## Detection Prevalence 0.07551 0.013265 0.9112
## Balanced Accuracy 0.56899 0.575740 0.5643
# Accoring to the results, the purned tree have same accuracy as the original tree
# using the purining method, we could make this tree less complicated while keeping the accuracy
set.seed(213)
rf1=randomForest(quality_degree ~.,data=training_set, mtry=4, importance=TRUE, ntree=500)
summary(rf1)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 3918 factor numeric
## err.rate 2000 -none- numeric
## confusion 12 -none- numeric
## votes 11754 matrix numeric
## oob.times 3918 -none- numeric
## classes 3 -none- character
## importance 55 -none- numeric
## importanceSD 44 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3918 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
predict_rf = predict(rf1, newdata = testing_set)
# calculate the accuracy and error rate
accuracy=mean(testing_set$quality_degree == predict_rf)
error=mean(testing_set$quality_degree != predict_rf)
# print out the accuracy and error rate
cat('accuracy is ', accuracy, "\n")
## accuracy is 0.9122449
cat('error rate is ', error)
## error rate is 0.0877551
set.seed(210)
rf2=randomForest(quality_degree ~.,data=training_set, mtry=11, importance=TRUE, ntree=500)
summary(rf2)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 3918 factor numeric
## err.rate 2000 -none- numeric
## confusion 12 -none- numeric
## votes 11754 matrix numeric
## oob.times 3918 -none- numeric
## classes 3 -none- character
## importance 55 -none- numeric
## importanceSD 44 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 3918 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
predict_rf2 = predict(rf2, newdata = testing_set)
# calculate the accuracy and error rate
accuracy=mean(testing_set$quality_degree == predict_rf2)
error=mean(testing_set$quality_degree != predict_rf2)
# print out the accuracy and error rate
cat('accuracy is ', accuracy, "\n")
## accuracy is 0.9183673
cat('error rate is ', error)
## error rate is 0.08163265
importance(rf2)
## High Low Mid MeanDecreaseAccuracy
## fixed.acidity 59.13778 53.06202 61.35774 77.51511
## volatile.acidity 99.96748 73.21482 90.86874 126.71534
## citric.acid 58.36603 39.33573 55.18533 73.83397
## residual.sugar 61.51853 50.24703 55.27263 79.26795
## chlorides 72.55040 42.69326 59.12336 85.90796
## free.sulfur.dioxide 67.86080 91.70688 87.12803 115.48873
## total.sulfur.dioxide 58.63962 48.07500 56.18040 82.17094
## density 48.00448 31.67580 44.49665 62.53439
## pH 76.99637 44.20504 71.49911 90.68676
## sulphates 78.03483 45.83338 67.42694 89.77290
## alcohol 149.65674 107.43122 94.02303 195.73010
## MeanDecreaseGini
## fixed.acidity 129.2981
## volatile.acidity 132.9597
## citric.acid 112.0393
## residual.sugar 132.4474
## chlorides 124.3271
## free.sulfur.dioxide 195.8889
## total.sulfur.dioxide 130.6064
## density 119.4986
## pH 135.2123
## sulphates 141.0688
## alcohol 295.8018
varImpPlot(rf2)
library(dplyr)
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
data = read.csv("wine_white.csv")
# step 1: resample the data
set.seed(213)
table(data$quality)
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
data <- data[sample(1:nrow(data),nrow(data),prob = ifelse(data$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(data$quality)
##
## 3 4 5 6 7 8 9
## 106 140 1412 2183 850 176 31
# step 2: split the categorical labels into 2 group: Above_Average and Below_Average
data = data %>% mutate(quality_degree = case_when(quality >= 6 ~ 'Above_Average', TRUE ~ 'Below_Average'))
data=data[, c(1:11, 13)]
data$quality_degree=as.factor(data$quality_degree)
df_gam=data[, c("alcohol", "volatile.acidity","free.sulfur.dioxide", "quality_degree")]
summary(df_gam)
## alcohol volatile.acidity free.sulfur.dioxide quality_degree
## Min. : 8.00 Min. :0.0800 Min. : 2.00 Above_Average:3240
## 1st Qu.: 9.50 1st Qu.:0.2100 1st Qu.: 24.00 Below_Average:1658
## Median :10.40 Median :0.2600 Median : 34.00
## Mean :10.53 Mean :0.2772 Mean : 35.91
## 3rd Qu.:11.40 3rd Qu.:0.3200 3rd Qu.: 46.00
## Max. :14.20 Max. :1.0050 Max. :289.00
set.seed(13)
#dim(data)
training_index=sample(1:dim(df_gam)[1], dim(df_gam)[1]*0.8)
testing_index=-training_index
training_set=df_gam[training_index, ]
testing_set=df_gam[testing_index, ]
# summary(training_set)
df_parameter=data.frame(model=c(paste("gam", 2:6)), test.accuracy=NA)
for (mydegree in 2:6){
gam1=gam(I(quality_degree=='Above_Average') ~ s(alcohol,mydegree) + s(volatile.acidity, mydegree)+ s(free.sulfur.dioxide,mydegree), family=binomial, data=training_set)
# Evaluate the model
# Prediction
pred_gam <- predict(gam1, newdata=testing_set, type="response")
# Confusion Matrix
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
df_parameter$test.accuracy[mydegree-1]=acc_gam
}
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7397959
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7142857
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7071429
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.705102
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7061224
df_parameter
## model test.accuracy
## 1 gam 2 0.7397959
## 2 gam 3 0.7142857
## 3 gam 4 0.7071429
## 4 gam 5 0.7051020
## 5 gam 6 0.7061224
gam1=gam(I(quality_degree=='Above_Average') ~ s(alcohol,2) + s(volatile.acidity, 2)+ s(free.sulfur.dioxide,2), family=binomial, data=training_set)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam1)
##
## Call: gam(formula = I(quality_degree == "Above_Average") ~ s(alcohol,
## 2) + s(volatile.acidity, 2) + s(free.sulfur.dioxide, 2),
## family = binomial, data = training_set)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7577 -0.9496 0.4643 0.8240 2.0818
##
## (Dispersion Parameter for binomial family taken to be 1)
##
## Null Deviance: 5022.995 on 3917 degrees of freedom
## Residual Deviance: 4009.184 on 3911 degrees of freedom
## AIC: 4023.183
##
## Number of Local Scoring Iterations: 18
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(alcohol, 2) 1 425.9 425.88 442.642 < 2.2e-16 ***
## s(volatile.acidity, 2) 1 186.3 186.33 193.664 < 2.2e-16 ***
## s(free.sulfur.dioxide, 2) 1 28.7 28.73 29.857 4.942e-08 ***
## Residuals 3911 3762.9 0.96
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar Chisq P(Chi)
## (Intercept)
## s(alcohol, 2) 1 5.917 0.01499 *
## s(volatile.acidity, 2) 1 31.114 2.433e-08 ***
## s(free.sulfur.dioxide, 2) 1 119.203 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(3,3))
plot(gam1, se=T, col='red')
# Prediction
pred_gam <- predict(gam1, newdata=testing_set, type="response")
# Confusion Matrix
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
## Predicted
## Actual FALSE TRUE
## Above_Average 87 567
## Below_Average 168 158
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
## Accuracy: 0.7397959
data = read.csv("wine_white.csv")
heatmap(cor(data))
# step 1: resample the data
set.seed(213)
table(data$quality)
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
data <- data[sample(1:nrow(data),nrow(data),prob = ifelse(data$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(data$quality)
##
## 3 4 5 6 7 8 9
## 106 140 1412 2183 850 176 31
# step 2: split the categorical labels into 2 group: Above_Average and Below_Average
data = data %>% mutate(quality_degree = case_when(quality >= 6 ~ 'Above_Average', TRUE ~ 'Below_Average'))
data=data[, c(1:11, 13)]
data$quality_degree=as.factor(data$quality_degree)
df_gam=data
set.seed(13)
#dim(data)
training_index=sample(1:dim(df_gam)[1], dim(df_gam)[1]*0.8)
testing_index=-training_index
training_set=df_gam[training_index, ]
testing_set=df_gam[testing_index, ]
# summary(training_set)
df_parameter=data.frame(model=c(paste("gam", 2:6)), test.accuracy=NA)
for (mydegree in 2:6){
gam=gam(I(quality_degree=='Above_Average') ~ s(fixed.acidity, mydegree)+s(volatile.acidity, mydegree)+s(citric.acid, mydegree) + s(residual.sugar,mydegree)+s(chlorides,mydegree) +s( free.sulfur.dioxide, mydegree) +s(pH, mydegree) + s(sulphates,mydegree)+s(alcohol,mydegree), family=binomial, data=training_set)
# Evaluate the model
# Prediction
pred_gam <- predict(gam, newdata=testing_set, type="response")
# Confusion Matrix
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
df_parameter$test.accuracy[mydegree-1]=acc_gam
}
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7346939
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7244898
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7040816
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7081633
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
## Accuracy: 0.7010204
df_parameter
## model test.accuracy
## 1 gam 2 0.7346939
## 2 gam 3 0.7244898
## 3 gam 4 0.7040816
## 4 gam 5 0.7081633
## 5 gam 6 0.7010204
gam2=gam(I(quality_degree=='Above_Average') ~ s(fixed.acidity, 2)+s(volatile.acidity, 2)+s(citric.acid, 2) + s(residual.sugar,2)+s(chlorides,2) +s(free.sulfur.dioxide, 2) +s(pH, 2) + s(sulphates,2)+s(alcohol,2), family=binomial, data=training_set)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts
## argument ignored
summary(gam2)
##
## Call: gam(formula = I(quality_degree == "Above_Average") ~ s(fixed.acidity,
## 2) + s(volatile.acidity, 2) + s(citric.acid, 2) + s(residual.sugar,
## 2) + s(chlorides, 2) + s(free.sulfur.dioxide, 2) + s(pH,
## 2) + s(sulphates, 2) + s(alcohol, 2), family = binomial,
## data = training_set)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9711 -0.8913 0.4395 0.8019 2.2180
##
## (Dispersion Parameter for binomial family taken to be 1)
##
## Null Deviance: 5022.995 on 3917 degrees of freedom
## Residual Deviance: 3877.554 on 3899 degrees of freedom
## AIC: 3915.553
##
## Number of Local Scoring Iterations: 15
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(fixed.acidity, 2) 1 12.3 12.34 12.7384 0.0003625 ***
## s(volatile.acidity, 2) 1 85.7 85.67 88.4664 < 2.2e-16 ***
## s(citric.acid, 2) 1 0.8 0.77 0.7994 0.3713353
## s(residual.sugar, 2) 1 15.9 15.87 16.3836 5.273e-05 ***
## s(chlorides, 2) 1 32.5 32.46 33.5182 7.616e-09 ***
## s(free.sulfur.dioxide, 2) 1 0.0 0.00 0.0002 0.9891921
## s(pH, 2) 1 1.7 1.73 1.7898 0.1810324
## s(sulphates, 2) 1 10.4 10.36 10.6975 0.0010821 **
## s(alcohol, 2) 1 456.4 456.37 471.2703 < 2.2e-16 ***
## Residuals 3899 3775.7 0.97
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar Chisq P(Chi)
## (Intercept)
## s(fixed.acidity, 2) 1 8.839 0.002948 **
## s(volatile.acidity, 2) 1 35.342 2.766e-09 ***
## s(citric.acid, 2) 1 8.761 0.003076 **
## s(residual.sugar, 2) 1 2.136 0.143889
## s(chlorides, 2) 1 21.572 3.408e-06 ***
## s(free.sulfur.dioxide, 2) 1 89.185 < 2.2e-16 ***
## s(pH, 2) 1 1.970 0.160380
## s(sulphates, 2) 1 3.705 0.054238 .
## s(alcohol, 2) 1 1.551 0.212991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(3,4))
plot(gam2, se=T, col='red')
# Prediction
pred_gam <- predict(gam2, newdata=testing_set, type="response")
# Confusion Matrix
cm_gam <- as.matrix(table(Actual=testing_set$quality_degree, Predicted=pred_gam > 0.5))
cm_gam
## Predicted
## Actual FALSE TRUE
## Above_Average 82 572
## Below_Average 178 148
# Accuracy
acc_gam <- sum(cm_gam[,2])/ sum(cm_gam)
cat("Accuracy: ", acc_gam)
## Accuracy: 0.7346939
set.seed(1234)
wine1 <- read.csv("wine_white.csv")
wine.df <- as.data.frame(wine1)
wine1 = wine1 %>% mutate(quality = as.factor(quality))
table(wine1$quality)
##
## 3 4 5 6 7 8 9
## 20 163 1457 2198 880 175 5
### we can see there the data is imbalanced.
### to deal with this data imbalance we will use oversampling
wine1 <- wine1[sample(1:nrow(wine1),nrow(wine1),prob = ifelse(wine$quality == c("9","3"), 0.95 , 0.10), replace = TRUE),]
table(wine1$quality)
##
## 3 4 5 6 7 8 9
## 121 139 1417 2155 840 197 29
train_index = sample(1: nrow(wine1),0.8*nrow(wine1))
train = wine1[train_index,]
test = wine1[-train_index,]
###Lasso Regression to do predictor selection
x = model.matrix(quality ~. , wine1)
y = wine1$quality
y.test = y[-train_index]
x.test = x[-train_index,]
y.train = y[train_index]
x.train = x[train_index,]
lasso.mod=glmnet(x.train,y.train,alpha=1,family = "multinomial",type.measure = "class")
plot(lasso.mod)
set.seed(2111)
cv.out=cv.glmnet(x.train , y.train ,alpha=1 ,family = "multinomial",type.measure = "class")
plot(cv.out)
bestlam =cv.out$lambda.min
lasso.pred=predict (lasso.mod ,s= bestlam ,newx=x.test)
mean((lasso.pred -y.test)^2)
## Warning in Ops.factor(lasso.pred, y.test): '-' not meaningful for factors
## [1] NA
out=glmnet(x , y,alpha=1,family = "multinomial",type.measure = "class")
lasso.coef=predict (out ,type="coefficients",s=bestlam)
lasso.coef
## $`3`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.038391e+02
## (Intercept) .
## fixed.acidity 5.598210e-01
## volatile.acidity 1.544162e+00
## citric.acid -1.522891e-01
## residual.sugar -1.069421e-01
## chlorides .
## free.sulfur.dioxide 1.521637e-02
## total.sulfur.dioxide 8.130362e-03
## density 7.998550e+01
## pH 1.017839e-01
## sulphates -5.320139e+00
## alcohol -2.527539e-02
##
## $`4`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.866734e+02
## (Intercept) .
## fixed.acidity -3.558959e-01
## volatile.acidity 4.859123e+00
## citric.acid 2.868932e-01
## residual.sugar -2.010750e-01
## chlorides 3.441476e+00
## free.sulfur.dioxide -5.490122e-02
## total.sulfur.dioxide -1.506094e-03
## density 1.886115e+02
## pH -3.441663e+00
## sulphates -1.771338e+00
## alcohol -6.230058e-01
##
## $`5`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.127979e+02
## (Intercept) .
## fixed.acidity -2.379319e-01
## volatile.acidity 7.425178e-01
## citric.acid 1.255643e-01
## residual.sugar -6.332946e-02
## chlorides 3.119120e-02
## free.sulfur.dioxide -8.886790e-03
## total.sulfur.dioxide 3.560308e-03
## density 1.437077e+01
## pH -2.868947e+00
## sulphates -1.683779e+00
## alcohol -8.317170e-01
##
## $`6`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.057067e+02
## (Intercept) .
## fixed.acidity -2.998124e-01
## volatile.acidity -4.502309e+00
## citric.acid -4.225328e-01
## residual.sugar .
## chlorides -1.419198e+00
## free.sulfur.dioxide -1.405314e-03
## total.sulfur.dioxide 1.065751e-03
## density .
## pH -2.897014e+00
## sulphates 1.816807e-01
## alcohol 1.304029e-02
##
## $`7`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.380622e+02
## (Intercept) .
## fixed.acidity 1.549179e-01
## volatile.acidity -7.172379e+00
## citric.acid -1.595888e+00
## residual.sugar 2.133992e-01
## chlorides -1.158916e+01
## free.sulfur.dioxide .
## total.sulfur.dioxide -3.274419e-04
## density -4.615320e+02
## pH .
## sulphates 2.333521e+00
## alcohol 7.265307e-02
##
## $`8`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.723865e+02
## (Intercept) .
## fixed.acidity .
## volatile.acidity -7.156854e+00
## citric.acid .
## residual.sugar 2.407980e-01
## chlorides 5.484369e+00
## free.sulfur.dioxide 9.851367e-03
## total.sulfur.dioxide .
## density -4.010822e+02
## pH 4.629667e-02
## sulphates .
## alcohol 4.676187e-01
##
## $`9`
## 13 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 98.56832794
## (Intercept) .
## fixed.acidity 2.46338102
## volatile.acidity .
## citric.acid 0.40586922
## residual.sugar 0.37889867
## chlorides -289.67223963
## free.sulfur.dioxide 0.06032348
## total.sulfur.dioxide -0.03565604
## density -287.74588347
## pH 16.75761392
## sulphates 2.87349833
## alcohol .
######for each class
##### knn
list.k = c(1:20)
res.acc = c()
for (val in list.k) {
knn.pred=knn(x.train,x.test,y.train ,k=val)
acc = mean(knn.pred == y.test)
res.acc[val] = acc
}
plot(list.k,res.acc)
res.acc[1]
## [1] 0.7765306
###according to the graph, we should use k = 1 as the paremeter for the knn method which accuracy is 0.787755
#split data
library(caret)
set.seed(22202)
train=createDataPartition(wine$quality, p = 0.7, list = FALSE)
test =-train
winetrain = wine[train,]
winetest =wine[test,]
X.train = wine[train,1:11]
Y.train =wine[train,12]
X.test = wine[test,1:11]
Y.test = wine[test,12]
#svm
#kernal=radial
library(e1071)
svm_model= svm(quality ~ . , data = winetrain,kernel='radial')
svm_result = predict(svm_model, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result, winetest$quality)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 2 3 0 0 0 0
## 5 1 30 257 104 6 1 0
## 6 5 16 176 536 196 43 0
## 7 0 0 1 19 62 8 1
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5842
## 95% CI : (0.5585, 0.6096)
## No Information Rate : 0.4492
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3184
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.041667 0.5881 0.8134 0.23485 0.00000
## Specificity 1.00000 0.997886 0.8621 0.4604 0.97589 1.00000
## Pos Pred Value NaN 0.400000 0.6441 0.5514 0.68132 NaN
## Neg Pred Value 0.99591 0.968536 0.8315 0.7515 0.85320 0.96455
## Prevalence 0.00409 0.032720 0.2979 0.4492 0.17996 0.03545
## Detection Rate 0.00000 0.001363 0.1752 0.3654 0.04226 0.00000
## Detection Prevalence 0.00000 0.003408 0.2720 0.6626 0.06203 0.00000
## Balanced Accuracy 0.50000 0.519776 0.7251 0.6369 0.60537 0.50000
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9993183
## Prevalence 0.0006817
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
#kernal=sigmoid
svm_model1= svm(quality ~ . , data = winetrain,kernel='sigmoid')
svm_result1 = predict(svm_model1, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result1, winetest$quality)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 2 14 6 0 0 0
## 5 1 24 210 180 27 3 0
## 6 4 18 181 353 145 27 0
## 7 0 4 32 119 88 21 1
## 8 1 0 0 1 4 1 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.4458
## 95% CI : (0.4202, 0.4717)
## No Information Rate : 0.4492
## P-Value [Acc > NIR] : 0.6133
##
## Kappa : 0.152
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7
## Sensitivity 0.00000 0.041667 0.4805 0.5357 0.33333
## Specificity 1.00000 0.985906 0.7718 0.5359 0.85287
## Pos Pred Value NaN 0.090909 0.4719 0.4849 0.33208
## Neg Pred Value 0.99591 0.968166 0.7779 0.5859 0.85358
## Prevalence 0.00409 0.032720 0.2979 0.4492 0.17996
## Detection Rate 0.00000 0.001363 0.1431 0.2406 0.05999
## Detection Prevalence 0.00000 0.014997 0.3033 0.4963 0.18064
## Balanced Accuracy 0.50000 0.513786 0.6262 0.5358 0.59310
## Class: 8 Class: 9
## Sensitivity 0.0192308 0.0000000
## Specificity 0.9957597 1.0000000
## Pos Pred Value 0.1428571 NaN
## Neg Pred Value 0.9650685 0.9993183
## Prevalence 0.0354465 0.0006817
## Detection Rate 0.0006817 0.0000000
## Detection Prevalence 0.0047716 0.0000000
## Balanced Accuracy 0.5074952 0.5000000
#kernal=poly
svm_model2= svm(quality ~ . , data = winetrain,kernel='polynomial')
svm_result2 = predict(svm_model2, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result2, winetest$quality)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 10 7 2 0 0 0
## 5 1 19 155 63 7 0 0
## 6 5 19 273 584 219 47 0
## 7 0 0 2 9 38 5 1
## 8 0 0 0 0 0 0 0
## 9 0 0 0 1 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5365
## 95% CI : (0.5106, 0.5622)
## No Information Rate : 0.4492
## P-Value [Acc > NIR] : 1.277e-11
##
## Kappa : 0.2168
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.208333 0.3547 0.8862 0.14394 0.00000
## Specificity 1.00000 0.993658 0.9126 0.3032 0.98587 1.00000
## Pos Pred Value NaN 0.526316 0.6327 0.5092 0.69091 NaN
## Neg Pred Value 0.99591 0.973757 0.7692 0.7656 0.83994 0.96455
## Prevalence 0.00409 0.032720 0.2979 0.4492 0.17996 0.03545
## Detection Rate 0.00000 0.006817 0.1057 0.3981 0.02590 0.00000
## Detection Prevalence 0.00000 0.012952 0.1670 0.7819 0.03749 0.00000
## Balanced Accuracy 0.50000 0.600995 0.6337 0.5947 0.56490 0.50000
## Class: 9
## Sensitivity 0.0000000
## Specificity 0.9993179
## Pos Pred Value 0.0000000
## Neg Pred Value 0.9993179
## Prevalence 0.0006817
## Detection Rate 0.0000000
## Detection Prevalence 0.0006817
## Balanced Accuracy 0.4996589
#kernal=linear
svm_model3= svm(quality ~ . , data = winetrain,kernel='linear')
svm_result3 = predict(svm_model3, newdata = winetest[,!colnames(winetest) %in% c("quality")])
confusionMatrix(svm_result3, winetest$quality)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 1 29 229 117 11 3 0
## 6 5 19 208 542 253 49 1
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5256
## 95% CI : (0.4996, 0.5514)
## No Information Rate : 0.4492
## P-Value [Acc > NIR] : 2.704e-09
##
## Kappa : 0.1972
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.00000 0.5240 0.8225 0.00 0.00000
## Specificity 1.00000 1.00000 0.8437 0.3379 1.00 1.00000
## Pos Pred Value NaN NaN 0.5872 0.5032 NaN NaN
## Neg Pred Value 0.99591 0.96728 0.8069 0.7000 0.82 0.96455
## Prevalence 0.00409 0.03272 0.2979 0.4492 0.18 0.03545
## Detection Rate 0.00000 0.00000 0.1561 0.3695 0.00 0.00000
## Detection Prevalence 0.00000 0.00000 0.2658 0.7342 0.00 0.00000
## Balanced Accuracy 0.50000 0.50000 0.6839 0.5802 0.50 0.50000
## Class: 9
## Sensitivity 0.0000000
## Specificity 1.0000000
## Pos Pred Value NaN
## Neg Pred Value 0.9993183
## Prevalence 0.0006817
## Detection Rate 0.0000000
## Detection Prevalence 0.0000000
## Balanced Accuracy 0.5000000
#multinomial logistic regression model
library(nnet)
str(winetrain)
## 'data.frame': 3431 obs. of 12 variables:
## $ fixed.acidity : num 7 7.2 7.2 8.1 6.2 7 6.3 8.1 8.6 6.6 ...
## $ volatile.acidity : num 0.27 0.23 0.23 0.28 0.32 0.27 0.3 0.22 0.23 0.17 ...
## $ citric.acid : num 0.36 0.32 0.32 0.4 0.16 0.36 0.34 0.43 0.4 0.38 ...
## $ residual.sugar : num 20.7 8.5 8.5 6.9 7 20.7 1.6 1.5 4.2 1.5 ...
## $ chlorides : num 0.045 0.058 0.058 0.05 0.045 0.045 0.049 0.044 0.035 0.032 ...
## $ free.sulfur.dioxide : num 45 47 47 30 30 45 14 28 17 28 ...
## $ total.sulfur.dioxide: num 170 186 186 97 136 170 132 129 109 112 ...
## $ density : num 1.001 0.996 0.996 0.995 0.995 ...
## $ pH : num 3 3.19 3.19 3.26 3.18 3 3.3 3.22 3.14 3.25 ...
## $ sulphates : num 0.45 0.4 0.4 0.44 0.47 0.45 0.49 0.45 0.53 0.55 ...
## $ alcohol : num 8.8 9.9 9.9 10.1 9.6 8.8 9.5 11 9.7 11.4 ...
## $ quality : Factor w/ 7 levels "3","4","5","6",..: 4 4 4 4 4 4 4 4 3 5 ...
multinomModel=multinom(quality ~ ., data = winetrain)
## # weights: 91 (72 variable)
## initial value 6676.417721
## iter 10 value 4504.508732
## iter 20 value 4328.247226
## iter 30 value 4181.580560
## iter 40 value 3772.920846
## iter 50 value 3738.364479
## iter 60 value 3730.892578
## iter 70 value 3728.356930
## iter 80 value 3727.062078
## iter 90 value 3725.710133
## iter 100 value 3724.778463
## final value 3724.778463
## stopped after 100 iterations
summary(multinomModel)
## Call:
## multinom(formula = quality ~ ., data = winetrain)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 -8.5644472 -0.7575642 1.020941 1.119383 0.005782688
## 5 -0.4297904 -1.1978646 -2.965519 1.382383 0.058605511
## 6 -3.9405082 -1.3281131 -8.643098 1.685138 0.112602600
## 7 28.4597424 -1.1830105 -10.549078 0.767677 0.152865511
## 8 10.3949788 -1.1383010 -10.334086 1.249730 0.205061803
## 9 -38.3144078 1.4503718 -13.830056 1.822825 0.283671177
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 4 -10.82038 -0.092042259 0.013560405 19.81470 -1.066912
## 5 -11.70640 -0.055031198 0.016964964 24.71136 -3.246355
## 6 -11.69285 -0.047367193 0.014928454 20.52416 -2.994665
## 7 -31.53422 -0.038735252 0.012195765 -22.58238 -1.919272
## 8 -25.55608 -0.019816012 0.011299295 -14.47209 -1.182091
## 9 -47.73206 0.007218965 -0.001483365 -40.51331 11.797128
## sulphates alcohol
## 4 -0.6692671 0.11387371
## 5 -1.7145426 0.02841294
## 6 -0.3896408 0.87998174
## 7 0.9662256 1.41158756
## 8 0.6485456 1.81007572
## 9 -3.9754228 3.00644027
##
## Std. Errors:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 0.1261435 0.2492844 0.68353816 0.70178743 0.06521474
## 5 0.5111369 0.2385980 0.44570858 0.36419400 0.06110678
## 6 0.4297549 0.2387147 0.41812528 0.33162194 0.06107124
## 7 0.5495636 0.2435921 0.54927574 0.45840138 0.06190275
## 8 0.1081364 0.2584410 0.93185816 0.75779797 0.06392709
## 9 0.1666752 0.4944409 0.03573703 0.07221853 0.09643979
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 4 0.06515046 0.01621825 0.008808593 0.1256969 0.4512610
## 5 0.96828679 0.01345030 0.008397584 0.5024334 0.4421693
## 6 0.95972836 0.01329874 0.008386623 0.4223883 0.4149276
## 7 0.05037647 0.01352164 0.008498867 0.5417329 0.4476471
## 8 0.01210080 0.01370128 0.009070917 0.1070391 0.4593520
## 9 0.01644169 0.03830606 0.022564035 0.1693621 1.0846948
## sulphates alcohol
## 4 0.7960803 0.2133457
## 5 0.4028178 0.1976410
## 6 0.3295562 0.1956629
## 7 0.3814666 0.1980105
## 8 0.6152199 0.2103405
## 9 0.2679084 0.3743991
##
## Residual Deviance: 7449.557
## AIC: 7593.557
#predict
predict_class=predict(multinomModel,winetest)
#Misclassification error
cm = table(predict_class, winetest$quality)
confusionMatrix(predict_class, winetest$quality)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 3 4 5 6 7 8 9
## 3 0 0 0 0 0 0 0
## 4 0 4 3 2 0 0 0
## 5 2 25 231 113 14 6 0
## 6 4 19 200 498 188 37 0
## 7 0 0 3 45 62 9 1
## 8 0 0 0 0 0 0 0
## 9 0 0 0 1 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.5419
## 95% CI : (0.516, 0.5677)
## No Information Rate : 0.4492
## P-Value [Acc > NIR] : 6.785e-13
##
## Kappa : 0.2564
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 3 Class: 4 Class: 5 Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.083333 0.5286 0.7557 0.23485 0.00000
## Specificity 1.00000 0.996476 0.8447 0.4455 0.95179 1.00000
## Pos Pred Value NaN 0.444444 0.5908 0.5264 0.51667 NaN
## Neg Pred Value 0.99591 0.969822 0.8086 0.6910 0.85004 0.96455
## Prevalence 0.00409 0.032720 0.2979 0.4492 0.17996 0.03545
## Detection Rate 0.00000 0.002727 0.1575 0.3395 0.04226 0.00000
## Detection Prevalence 0.00000 0.006135 0.2665 0.6449 0.08180 0.00000
## Balanced Accuracy 0.50000 0.539905 0.6866 0.6006 0.59332 0.50000
## Class: 9
## Sensitivity 0.0000000
## Specificity 0.9993179
## Pos Pred Value 0.0000000
## Neg Pred Value 0.9993179
## Prevalence 0.0006817
## Detection Rate 0.0000000
## Detection Prevalence 0.0006817
## Balanced Accuracy 0.4996589
#multinomial logistic regression model using mlogit()
library(mlogit)
## Loading required package: Formula
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: lmtest
H <- mlogit.data(winetrain, choice = "quality", shape = "wide")
mlogitmodel <- mlogit(quality ~ 1 | fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol, data = H)
mlogitmodel
##
## Call:
## mlogit(formula = quality ~ 1 | fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol, data = H, method = "nr")
##
## Coefficients:
## 4:(intercept) 5:(intercept) 6:(intercept)
## -2.8590e+02 -8.4188e+01 -9.5809e+00
## 7:(intercept) 8:(intercept) 9:(intercept)
## 5.4995e+02 7.8993e+02 3.1936e+03
## 4:fixed.acidity 5:fixed.acidity 6:fixed.acidity
## -1.1086e+00 -1.3841e+00 -1.4564e+00
## 7:fixed.acidity 8:fixed.acidity 9:fixed.acidity
## -9.0957e-01 -6.7954e-01 6.0766e+00
## 4:volatile.acidity 5:volatile.acidity 6:volatile.acidity
## 2.5623e-01 -3.5165e+00 -9.1329e+00
## 7:volatile.acidity 8:volatile.acidity 9:volatile.acidity
## -1.0969e+01 -1.0634e+01 -2.2988e+01
## 4:citric.acid 5:citric.acid 6:citric.acid
## 5.3941e-01 9.8451e-01 1.3139e+00
## 7:citric.acid 8:citric.acid 9:citric.acid
## 4.6225e-01 9.5835e-01 2.0964e+00
## 4:residual.sugar 5:residual.sugar 6:residual.sugar
## -1.3451e-01 1.8101e-02 1.0231e-01
## 7:residual.sugar 8:residual.sugar 9:residual.sugar
## 3.3452e-01 4.9144e-01 1.4790e+00
## 4:chlorides 5:chlorides 6:chlorides
## -9.8593e+00 -1.0064e+01 -9.5982e+00
## 7:chlorides 8:chlorides 9:chlorides
## -2.4082e+01 -1.5080e+01 -2.9459e+02
## 4:free.sulfur.dioxide 5:free.sulfur.dioxide 6:free.sulfur.dioxide
## -8.6527e-02 -5.3931e-02 -4.6736e-02
## 7:free.sulfur.dioxide 8:free.sulfur.dioxide 9:free.sulfur.dioxide
## -4.1125e-02 -2.3049e-02 -3.8749e-02
## 4:total.sulfur.dioxide 5:total.sulfur.dioxide 6:total.sulfur.dioxide
## 1.1786e-02 1.5538e-02 1.3735e-02
## 7:total.sulfur.dioxide 8:total.sulfur.dioxide 9:total.sulfur.dioxide
## 1.3421e-02 1.3333e-02 4.1507e-02
## 4:density 5:density 6:density
## 3.0779e+02 1.1458e+02 3.1067e+01
## 7:density 8:density 9:density
## -5.4652e+02 -8.0023e+02 -3.3686e+03
## 4:pH 5:pH 6:pH
## -3.3613e+00 -4.5219e+00 -3.9704e+00
## 7:pH 8:pH 9:pH
## -1.1829e+00 3.8343e-01 3.2454e+01
## 4:sulphates 5:sulphates 6:sulphates
## -9.4671e-01 -1.5138e+00 -6.8293e-02
## 7:sulphates 8:sulphates 9:sulphates
## 1.9959e+00 2.0382e+00 -2.3890e+00
## 4:alcohol 5:alcohol 6:alcohol
## 3.2991e-01 6.3759e-02 8.2709e-01
## 7:alcohol 8:alcohol 9:alcohol
## 7.7186e-01 8.9899e-01 2.8399e-01
summary(mlogitmodel)
##
## Call:
## mlogit(formula = quality ~ 1 | fixed.acidity + volatile.acidity +
## citric.acid + residual.sugar + chlorides + free.sulfur.dioxide +
## total.sulfur.dioxide + density + pH + sulphates + alcohol,
## data = H, method = "nr")
##
## Frequencies of alternatives:
## 3 4 5 6 7 8 9
## 0.0040804 0.0335179 0.2972894 0.4485573 0.1795395 0.0358496 0.0011658
##
## nr method
## 12 iterations, 0h:0m:4s
## g'(-H)^-1g = 0.000107
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## 4:(intercept) -2.8590e+02 5.0714e+02 -0.5637 0.5729275
## 5:(intercept) -8.4188e+01 5.0122e+02 -0.1680 0.8666090
## 6:(intercept) -9.5809e+00 5.0103e+02 -0.0191 0.9847435
## 7:(intercept) 5.4995e+02 5.1117e+02 1.0759 0.2819874
## 8:(intercept) 7.8993e+02 5.5591e+02 1.4210 0.1553261
## 9:(intercept) 3.1936e+03 1.9163e+03 1.6665 0.0956044 .
## 4:fixed.acidity -1.1086e+00 5.5301e-01 -2.0047 0.0449958 *
## 5:fixed.acidity -1.3841e+00 5.3692e-01 -2.5779 0.0099403 **
## 6:fixed.acidity -1.4564e+00 5.3686e-01 -2.7129 0.0066704 **
## 7:fixed.acidity -9.0957e-01 5.4489e-01 -1.6693 0.0950613 .
## 8:fixed.acidity -6.7954e-01 5.8350e-01 -1.1646 0.2441844
## 9:fixed.acidity 6.0766e+00 2.5999e+00 2.3373 0.0194262 *
## 4:volatile.acidity 2.5623e-01 2.2463e+00 0.1141 0.9091846
## 5:volatile.acidity -3.5165e+00 2.1945e+00 -1.6024 0.1090576
## 6:volatile.acidity -9.1329e+00 2.2216e+00 -4.1110 3.939e-05 ***
## 7:volatile.acidity -1.0969e+01 2.2839e+00 -4.8027 1.565e-06 ***
## 8:volatile.acidity -1.0634e+01 2.5123e+00 -4.2326 2.310e-05 ***
## 9:volatile.acidity -2.2988e+01 1.0971e+01 -2.0953 0.0361449 *
## 4:citric.acid 5.3941e-01 2.4573e+00 0.2195 0.8262508
## 5:citric.acid 9.8451e-01 2.3298e+00 0.4226 0.6726023
## 6:citric.acid 1.3139e+00 2.3343e+00 0.5629 0.5735092
## 7:citric.acid 4.6225e-01 2.3777e+00 0.1944 0.8458575
## 8:citric.acid 9.5835e-01 2.5127e+00 0.3814 0.7029060
## 9:citric.acid 2.0964e+00 3.8345e+00 0.5467 0.5845619
## 4:residual.sugar -1.3451e-01 1.9598e-01 -0.6863 0.4925030
## 5:residual.sugar 1.8101e-02 1.9067e-01 0.0949 0.9243664
## 6:residual.sugar 1.0231e-01 1.9067e-01 0.5366 0.5915607
## 7:residual.sugar 3.3452e-01 1.9449e-01 1.7200 0.0854345 .
## 8:residual.sugar 4.9144e-01 2.1134e-01 2.3254 0.0200528 *
## 9:residual.sugar 1.4790e+00 7.7338e-01 1.9123 0.0558322 .
## 4:chlorides -9.8593e+00 7.3695e+00 -1.3379 0.1809420
## 5:chlorides -1.0064e+01 6.2972e+00 -1.5982 0.1099888
## 6:chlorides -9.5982e+00 6.3392e+00 -1.5141 0.1299966
## 7:chlorides -2.4082e+01 7.8937e+00 -3.0508 0.0022824 **
## 8:chlorides -1.5080e+01 1.0542e+01 -1.4304 0.1525880
## 9:chlorides -2.9459e+02 1.3261e+02 -2.2215 0.0263155 *
## 4:free.sulfur.dioxide -8.6527e-02 1.7053e-02 -5.0739 3.898e-07 ***
## 5:free.sulfur.dioxide -5.3931e-02 1.4413e-02 -3.7419 0.0001826 ***
## 6:free.sulfur.dioxide -4.6736e-02 1.4278e-02 -3.2734 0.0010626 **
## 7:free.sulfur.dioxide -4.1125e-02 1.4529e-02 -2.8305 0.0046472 **
## 8:free.sulfur.dioxide -2.3049e-02 1.4793e-02 -1.5580 0.1192277
## 9:free.sulfur.dioxide -3.8749e-02 5.1578e-02 -0.7513 0.4524933
## 4:total.sulfur.dioxide 1.1786e-02 1.0212e-02 1.1541 0.2484437
## 5:total.sulfur.dioxide 1.5538e-02 9.8256e-03 1.5814 0.1137927
## 6:total.sulfur.dioxide 1.3735e-02 9.8243e-03 1.3980 0.1621021
## 7:total.sulfur.dioxide 1.3421e-02 9.9491e-03 1.3490 0.1773421
## 8:total.sulfur.dioxide 1.3333e-02 1.0535e-02 1.2655 0.2056754
## 9:total.sulfur.dioxide 4.1507e-02 2.4287e-02 1.7090 0.0874419 .
## 4:density 3.0779e+02 5.1585e+02 0.5967 0.5507324
## 5:density 1.1458e+02 5.0969e+02 0.2248 0.8221260
## 6:density 3.1067e+01 5.0950e+02 0.0610 0.9513787
## 7:density -5.4652e+02 5.1973e+02 -1.0515 0.2930098
## 8:density -8.0023e+02 5.6491e+02 -1.4166 0.1566110
## 9:density -3.3686e+03 1.9628e+03 -1.7162 0.0861305 .
## 4:pH -3.3613e+00 2.9866e+00 -1.1254 0.2604074
## 5:pH -4.5219e+00 2.8735e+00 -1.5737 0.1155645
## 6:pH -3.9704e+00 2.8709e+00 -1.3830 0.1666636
## 7:pH -1.1829e+00 2.9029e+00 -0.4075 0.6836633
## 8:pH 3.8343e-01 3.0575e+00 0.1254 0.9002001
## 9:pH 3.2454e+01 1.2807e+01 2.5341 0.0112738 *
## 4:sulphates -9.4671e-01 2.9211e+00 -0.3241 0.7458715
## 5:sulphates -1.5138e+00 2.7710e+00 -0.5463 0.5848641
## 6:sulphates -6.8293e-02 2.7635e+00 -0.0247 0.9802844
## 7:sulphates 1.9959e+00 2.7825e+00 0.7173 0.4731891
## 8:sulphates 2.0382e+00 2.8668e+00 0.7110 0.4771022
## 9:sulphates -2.3890e+00 7.9361e+00 -0.3010 0.7633903
## 4:alcohol 3.2991e-01 7.0319e-01 0.4692 0.6389531
## 5:alcohol 6.3759e-02 6.9200e-01 0.0921 0.9265888
## 6:alcohol 8.2709e-01 6.9183e-01 1.1955 0.2318886
## 7:alcohol 7.7186e-01 7.0244e-01 1.0988 0.2718459
## 8:alcohol 8.9899e-01 7.4923e-01 1.1999 0.2301810
## 9:alcohol 2.8399e-01 2.0220e+00 0.1404 0.8883046
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -3699.1
## McFadden R^2: 0.16555
## Likelihood ratio test : chisq = 1467.8 (p.value = < 2.22e-16)
##Look at percent correct
correct <- mlogitmodel$probabilities
binarycorrect <- colnames(correct)[apply(correct,1,which.max)]
table(winetrain$quality,binarycorrect)
## binarycorrect
## 4 5 6 7 8 9
## 3 0 7 6 0 1 0
## 4 6 64 44 1 0 0
## 5 3 537 475 5 0 0
## 6 2 296 1139 102 0 0
## 7 0 28 428 160 0 0
## 8 0 7 76 40 0 0
## 9 0 0 1 2 0 1